runZinb = F
runClus = F
NCORES = 2
This document is a temptative of workflow with the following steps
Along the worflow, use deviance residuals as adjusted “counts”.
I am still using the data Davide gave me a while ago. And I think now it is not the same data Davide is using in its most recent vignette (that is zinb_clustering_over_k.Rmd). It should not be very important but it would be nice if we have a common dataset we can work on.
load("../data/Expt4c2b_filtdata.Rda")
load("../data/E4c2b_slingshot_wsforkelly.RData")
Here we only look at the 1000 most variable genes.
names(batch) <- colnames(counts)
counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]
vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]
We have 616 cells.
dim(core)
## [1] 1000 616
core[1:3, 1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 8763 7221 3581
## Cbr2 5799 3638 1448
## Cyp2f2 2158 2027 1078
Cells have been processed in 18 different batches
table(batch)
## batch
## GBC08A GBC08B GBC09A GBC09B P01 P02 P03A P03B P04 P05
## 36 33 30 17 25 43 37 33 14 20
## P06 P10 P11 P12 P13 P14 Y01 Y04
## 39 36 44 40 47 39 51 32
Cells have been clustered into 13 different clusters
table(clus.labels)
## clus.labels
## 1 2 3 4 5 7 8 9 10 11 12 14 15
## 91 25 56 40 96 60 28 79 26 22 35 26 32
Batches are kind of confounded with the biology
table(data.frame(batch = as.vector(batch),
cluster = clus.labels))
## cluster
## batch 1 2 3 4 5 7 8 9 10 11 12 14 15
## GBC08A 0 2 12 9 0 0 0 0 0 2 0 2 9
## GBC08B 0 7 5 3 0 0 0 1 2 4 0 5 6
## GBC09A 0 1 5 9 0 0 0 1 1 0 0 6 7
## GBC09B 0 2 2 7 0 0 0 3 0 0 0 3 0
## P01 0 2 4 3 15 1 0 0 0 0 0 0 0
## P02 2 0 9 3 15 3 3 2 3 0 2 1 0
## P03A 3 0 3 0 12 2 9 4 2 0 2 0 0
## P03B 1 2 1 1 11 1 2 10 1 1 2 0 0
## P04 0 0 0 0 11 1 0 1 1 0 0 0 0
## P05 0 0 0 1 11 3 0 1 0 2 2 0 0
## P06 1 2 3 0 8 2 4 8 4 1 2 2 2
## P10 3 1 4 0 4 5 9 2 0 2 5 0 1
## P11 2 1 1 0 1 5 1 22 3 1 6 0 1
## P12 0 2 0 0 4 10 0 8 2 3 6 4 1
## P13 1 2 4 0 4 15 0 4 5 6 1 3 2
## P14 0 0 1 2 0 12 0 12 2 0 7 0 3
## Y01 47 1 1 2 0 0 0 0 0 0 0 0 0
## Y04 31 0 1 0 0 0 0 0 0 0 0 0 0
compute.naive.residuals <- function(Y, zinb){
mu_hat = t(getMu(zinb))
pi_hat = t(getPi(zinb))
Y_hat = (1 - pi_hat) * mu_hat
Y - Y_hat
}
compute.pearson.residuals <- function(Y, zinb){
num = compute.naive.residuals(Y, zinb)
mu = t(getMu(zinb))
pi = t(getPi(zinb))
phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
num / sqrt(var_hat)
}
compute.zinb.loglik <- function(Y, zinb){
mu = t(getMu(zinb))
theta = getTheta(zinb)
theta_mat = matrix(rep(theta, ncol(Y), ncol = ncol(Y)))
pi = t(getPi(zinb))
log( pi * (Y == 0) + (1 - pi) * dnbinom(Y, size = theta, mu = mu) )
}
compute.deviance.residuals <- function(Y, zinb){
mu_hat = t(getMu(zinb))
pi_hat = t(getPi(zinb))
Y_hat = (1 - pi_hat) * mu_hat
ll = compute.zinb.loglik(Y, zinb)
sign = 1*(Y - Y_hat > 0)
sign[sign == 0] = -1
sign * sqrt(-2 * ll)
}
Let’s run zinbwave with K = 0 and X batches.
fn = '../data/zinb_k0_batch.rda'
if (runZinb){
mod = model.matrix( ~ batch)
print(system.time(zinb0 <- zinbFit(core, K = 0, X = mod, ncores = NCORES)))
save(zinb0, file = fn)
}else{
load(fn)
}
We use deviance residuals for visualization. Let’s check that deviance residuals look ok.
res = compute.deviance.residuals(core, zinb0)
res[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 4.637339 4.591452 -4.440574
## Cbr2 4.552445 -4.441892 -4.248677
## Cyp2f2 4.327763 4.310824 -4.159757
PCA on the residuals where color are for batches on the left and previously found clusters on the right.
pca = prcomp(t(res))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Boxplot of the residuals for each cell.
res_order = res[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
boxplot(res_order, main='Boxplot of residuals\ncolor=batch', col = col_order, staplewex = 0, outline = 0, border = col_order, xaxt = 'n')
Deviance residuals seem ok.
Let’s run zinbwave with K = 50 and X including batches.
fn = '../data/zinb_k50_batch.rda'
if (runZinb){
mod = model.matrix( ~ batch)
print(system.time(zinb50 <- zinbFit(core, ncores = NCORES, K = 50,
X = mod)))
save(zinb50, file = fn)
}else{
load(fn)
}
I use the code of Davide from zinb_clutering_over_k.Rmd
## matching the parameters used by Russell
W = zinb50@W
fn = '../data/clustExp_W_k50.rda'
if(runClus) {
cl_res <- clusterMany(t(W), ks = 4:15, alphas = c(0.1), betas = 0.8,
clusterFunction = "hierarchical01", minSizes=1,
sequential = TRUE, subsample = TRUE, ncores = 7,
subsampleArgs = list(resamp.num=100,
clusterFunction="kmeans",
clusterArgs=list(nstart=10)),
seqArgs = list(k.min=3, top.can=5), verbose=TRUE)
save(cl_res, file = fn)
} else {
load(fn)
}
combined <- combineMany(cl_res, proportion = 0.7)
## Note: no clusters specified to combine, using results from clusterMany
## Warning in .makeColors(clusters, colors = bigPalette): too many clusters to
## have unique color assignments
plotClusters(combined, colPalette = c(bigPalette, rainbow(100)))
plotCoClustering(combined)
## Warning in .makeColors(clusters, colors = bigPalette): too many clusters to
## have unique color assignments
table(primaryClusterNamed(combined))
##
## -1 c1 c2 c3 c4 c5 c6 c7 c8 c9
## 142 92 5 92 74 12 108 46 30 15
sum(primaryCluster(combined) == -1)
## [1] 142
plot(getW(zinb50), col=clus.labels, main = 'color = previous')
plot(getW(zinb50), col=as.factor(primaryCluster(combined)), main = 'color = new')
table(data.frame(previous = clus.labels, new = primaryCluster(combined)))
## new
## previous -1 1 2 3 4 5 6 7 8 9
## 1 40 48 2 0 1 0 0 0 0 0
## 2 2 0 0 23 0 0 0 0 0 0
## 3 1 2 0 51 1 1 0 0 0 0
## 4 4 0 0 0 21 0 0 0 0 15
## 5 42 42 3 0 1 8 0 0 0 0
## 7 10 0 0 0 49 1 0 0 0 0
## 8 28 0 0 0 0 0 0 0 0 0
## 9 4 0 0 1 1 0 73 0 0 0
## 10 1 0 0 0 0 0 0 25 0 0
## 11 5 0 0 15 0 2 0 0 0 0
## 12 0 0 0 0 0 0 35 0 0 0
## 14 4 0 0 1 0 0 0 21 0 0
## 15 1 0 0 1 0 0 0 0 30 0
combined <- makeDendrogram(combined)
plotDendrogram(combined)
merged <- mergeClusters(combined, mergeMethod = "locfdr", cutoff = 0.01)
## Note: Merging will be done on ' combineMany ', with clustering index 1
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning in locfdr::locfdr(tstats, plot = 0): CM estimation failed, middle
## of histogram non-normal
## Warning in .makeColors(clusters, colors = bigPalette): too many clusters to
## have unique color assignments
Just to see what it looks like, let’s look at the heatmap on the W (number of cells x K=50). Now, we would like to plot a heatmap using the residuals (number of cells x number of genes). When I call plotHeatmap using argument visualizeData = residuals_matrix, it does not work and says that if give separate visualizeData, must be of same dimensions as assay(data).
plotHeatmap(merged, clusterSamplesData="dendrogramValue", breaks = .99)
So, let’s look at a heatmap on deviance residuals using heatmap.2 from gplots package with all the 1000 most variable genes.
palDF = merged@clusterLegend[[1]]
pal = palDF[, 'color']
names(pal) = palDF[, 'name']
heatmap.2(res, Colv = merged@dendro_samples,
scale = 'none', trace = 'none',
col = colorRampPalette(c('blue', 'yellow'))(51),
ColSideColors = pal[primaryClusterNamed(merged)],
labCol = '', main = 'Deviance Residuals, 1000 most variable genes')
legend("bottom", legend = names(pal), fill = pal, title = 'Clusters',
horiz = T, cex = 0.5)
I used the unsupervised mode on W got from fitting zinbwave with batches in X and K = 4.
fn = '../data/zinb_k4_batch.rda'
if (runZinb){
cl_labs = primaryCluster(merged)
batch_red = as.vector(batch)
batch_red = batch_red[cl_labs != -1]
cl_labs = cl_labs[cl_labs != -1]
mod = model.matrix( ~ batch_red)
zinb4 = zinbFit(core[,primaryCluster(merged) != -1],
X = mod, K = 4, ncores = NCORES)
save(zinb4, file = fn)
}else{
cl_labs = primaryCluster(merged)
batch_red = as.vector(batch)
batch_red = batch_red[cl_labs != -1]
cl_labs = cl_labs[cl_labs != -1]
mod = model.matrix( ~ batch_red)
load(fn)
}
X <- zinb4@W
lineages <- get_lineages(X, clus.labels = cl_labs, start.clus = 1)
curves <- get_curves(X, clus.labels = cl_labs, lineages = lineages)
plot_curves(X, cl_labs, curves, col.clus = unique(cl_labs))
print(lineages$lineage1)
## [1] "1" "5" "3" "7" "6"
print(lineages$lineage2)
## [1] "1" "5" "3" "8"
print(lineages$lineage3)
## [1] "1" "5" "4" "9"
print(lineages$lineage4)
## [1] "1" "2"
Which clusters would it make sense to test one versus the other? Would it be interesting to do two by two comparison between clusters next to each other in the different lineages?
DE analysis using ZINB-Wave has not been implemented yet and we are still thinking about it. Would it make sense just to test the betas between clusters making the assumption that the genes are independent? Below is just a visualization of what we could have at the end of the worflow: a heatmap of the deviance residuals for DE genes between clusters.
de = read.csv('../data/oe_markers.txt', stringsAsFactors = F, header = F)
de = de$V1
breaks = setBreaks(res[rownames(res) %in% de, ], 0.99)
heatmap.2(res[rownames(res) %in% de, ], breaks = breaks,
Colv = merged@dendro_samples,
scale = 'none', trace = 'none',
col = colorRampPalette(c('blue', 'yellow'))(51),
ColSideColors = pal[primaryClusterNamed(merged)],
labCol = '', main = 'Deviance Residuals, DE genes (Russell)')
legend("bottom", legend = names(pal), fill = pal, title = 'Clusters',
horiz = T, cex = 0.5)
sessionInfo()
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] clusterExperiment_1.2.0 rARPACK_0.11-0
## [3] slingshot_0.0.3-5 princurve_1.1-12
## [5] digest_0.6.12 RColorBrewer_1.1-2
## [7] scone_1.0.0 Rtsne_0.13
## [9] magrittr_1.5 gplots_3.0.1
## [11] ggplot2_2.2.1 zinbwave_0.1.4
## [13] scRNAseq_1.2.0 SummarizedExperiment_1.6.1
## [15] DelayedArray_0.2.4 matrixStats_0.52.2
## [17] Biobase_2.36.2 GenomicRanges_1.28.3
## [19] GenomeInfoDb_1.12.0 IRanges_2.10.2
## [21] S4Vectors_0.14.2 BiocGenerics_0.22.0
##
## loaded via a namespace (and not attached):
## [1] shinydashboard_0.6.0 R.utils_2.5.0
## [3] RSQLite_1.1-2 AnnotationDbi_1.38.0
## [5] htmlwidgets_0.8 grid_3.4.0
## [7] trimcluster_0.1-2 BiocParallel_1.10.1
## [9] RNeXML_2.0.7 DESeq_1.28.0
## [11] munsell_0.4.3 codetools_0.2-15
## [13] statmod_1.4.29 scran_1.4.4
## [15] DT_0.2 miniUI_0.1.1
## [17] colorspace_1.3-2 energy_1.7-0
## [19] uuid_0.1-2 knitr_1.16
## [21] pspline_1.0-17 robustbase_0.92-7
## [23] bayesm_3.0-2 NMF_0.20.6
## [25] tximport_1.4.0 GenomeInfoDbData_0.99.0
## [27] hwriter_1.3.2 rhdf5_2.20.0
## [29] rprojroot_1.2 EDASeq_2.10.0
## [31] diptest_0.75-7 R6_2.2.1
## [33] doParallel_1.0.10 ggbeeswarm_0.5.3
## [35] taxize_0.8.4 locfit_1.5-9.1
## [37] flexmix_2.3-14 reshape_0.8.6
## [39] bitops_1.0-6 assertthat_0.2.0
## [41] scales_0.4.1 nnet_7.3-12
## [43] phylobase_0.8.4 beeswarm_0.2.3
## [45] gtable_0.2.0 RUVSeq_1.10.0
## [47] bold_0.4.0 rlang_0.1.1
## [49] genefilter_1.58.1 splines_3.4.0
## [51] rtracklayer_1.36.3 lazyeval_0.2.0
## [53] hexbin_1.27.1 rgl_0.98.1
## [55] yaml_2.1.14 reshape2_1.4.2
## [57] abind_1.4-5 GenomicFeatures_1.28.0
## [59] backports_1.1.0 httpuv_1.3.3
## [61] tensorA_0.36 tools_3.4.0
## [63] gridBase_0.4-7 dynamicTreeCut_1.63-1
## [65] stabledist_0.7-1 Rcpp_0.12.11
## [67] plyr_1.8.4 progress_1.1.2
## [69] visNetwork_1.0.3 zlibbioc_1.22.0
## [71] purrr_0.2.2.2 RCurl_1.95-4.8
## [73] prettyunits_1.0.2 viridis_0.4.0
## [75] zoo_1.8-0 cluster_2.0.6
## [77] data.table_1.10.4 RSpectra_0.12-0
## [79] mvtnorm_1.0-6 whisker_0.3-2
## [81] gsl_1.9-10.3 aroma.light_3.6.0
## [83] mime_0.5 evaluate_0.10
## [85] xtable_1.8-2 XML_3.98-1.7
## [87] mclust_5.3 gridExtra_2.2.1
## [89] compiler_3.4.0 biomaRt_2.32.0
## [91] scater_1.4.0 tibble_1.3.1
## [93] KernSmooth_2.23-15 R.oo_1.21.0
## [95] htmltools_0.3.6 segmented_0.5-2.0
## [97] pcaPP_1.9-61 tidyr_0.6.3
## [99] geneplotter_1.54.0 howmany_0.3-1
## [101] DBI_0.6-1 MASS_7.3-47
## [103] fpc_2.1-10 MAST_1.2.1
## [105] boot_1.3-19 compositions_1.40-1
## [107] ade4_1.7-6 ShortRead_1.34.0
## [109] Matrix_1.2-10 R.methodsS3_1.7.1
## [111] gdata_2.17.0 igraph_1.0.1
## [113] rncl_0.8.2 GenomicAlignments_1.12.1
## [115] registry_0.3 locfdr_1.1-8
## [117] numDeriv_2016.8-1 plotly_4.6.0
## [119] xml2_1.1.1 foreach_1.4.3
## [121] annotate_1.54.0 vipor_0.4.5
## [123] rngtools_1.2.4 pkgmaker_0.22
## [125] XVector_0.16.0 stringr_1.2.0
## [127] copula_0.999-16 ADGofTest_0.3
## [129] softImpute_1.4 Biostrings_2.44.0
## [131] rmarkdown_1.5 dendextend_1.5.2
## [133] edgeR_3.18.1 kernlab_0.9-25
## [135] shiny_1.0.3 Rsamtools_1.28.0
## [137] gtools_3.5.0 modeltools_0.2-21
## [139] rjson_0.2.15 nlme_3.1-131
## [141] jsonlite_1.4 viridisLite_0.2.0
## [143] limma_3.32.2 lattice_0.20-35
## [145] httr_1.2.1 DEoptimR_1.0-8
## [147] survival_2.41-3 FNN_1.1
## [149] prabclus_2.2-6 iterators_1.0.8
## [151] glmnet_2.0-10 class_7.3-14
## [153] stringi_1.1.5 mixtools_1.1.0
## [155] latticeExtra_0.6-28 caTools_1.17.1
## [157] memoise_1.1.0 dplyr_0.5.0
## [159] ape_4.1